Section 2: Intro to Plotting with ggplot

Author

Milo Coolman

Section 2.2: Basic Plot Structure

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Import the STAT 113 survey data from the course's GitHub repository.
stat113_df <- read_csv(
  file = "https://raw.githubusercontent.com/highamm/ds234_quarto/main/data_online/stat113.csv"
)
Rows: 397 Columns: 12
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): Year, Sex, Sport, Award, SocialMedia
dbl (7): Hgt, Wgt, Haircut, GPA, Exercise, TV, Pulse

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Histogram of Exercise using 17 bins, gold bars with a gray1 outline.
stat113_df |>
  ggplot(aes(x = Exercise)) +
  geom_histogram(bins = 17, fill = "gold", colour = "gray1")
Warning: Removed 7 rows containing non-finite outside the scale range
(`stat_bin()`).

# Frequency polygon of GPA using 15 bins.
stat113_df |>
  ggplot(aes(x = GPA)) +
  geom_freqpoly(bins = 15)
Warning: Removed 70 rows containing non-finite outside the scale range
(`stat_bin()`).

Exercises 4-6

# Bar plot counting the rows in each Award category.
stat113_df |>
  ggplot(aes(x = Award)) +
  geom_bar(fill = "gold", colour = "gray1")

# One bar per Year, split by Sex; position = "fill" rescales every bar to
# the same height, so the y-axis is relabelled as a proportion.
stat113_df |>
  ggplot(aes(x = Year, fill = Sex)) +
  geom_bar(position = "fill") +
  labs(y = "Proportion")

Exercises 7-8

#7 Freshmen and Sophomores have a much larger range of weights than Juniors and Seniors, but on average most people weigh between 135 and 170 lbs.

# Violin plot of Wgt for each Year.
stat113_df |>
  ggplot(aes(x = Year, y = Wgt)) +
  geom_violin(fill = "gold", colour = "gray1")
Warning: Removed 11 rows containing non-finite outside the scale range
(`stat_ydensity()`).

Exercises 9-11

#9 There are multiple points showing that you were moving; however, the active_cals are listed as 0, which should not be possible. #10 It seems as though most of your steps throughout the day are on flat surfaces, and not on exercise machinery such as a StairMaster.

library(tidyverse)
# Import the fitness data, reading stepgoal as a factor, then label each row
# as "weekend" (Sat/Sun) or "weekday" (everything else).
fitness_full <- read_csv(
  file = "https://raw.githubusercontent.com/highamm/ds234_quarto/main/data_online/higham_fitness_clean.csv",
  col_types = list(stepgoal = col_factor())
) |>
  mutate(
    weekend_ind = case_when(
      weekday %in% c("Sat", "Sun") ~ "weekend",
      .default = "weekday"
    )
  )

# Scatterplot of flights against steps.
fitness_full |>
  ggplot(aes(x = steps, y = flights)) +
  geom_point()

# Hgt against Wgt, with point colour mapped to Sex.
stat113_df |>
  ggplot(aes(x = Wgt, y = Hgt, colour = Sex)) +
  geom_point()
Warning: Removed 16 rows containing missing values or values outside the scale range
(`geom_point()`).

# Hgt against Wgt, with point shape mapped to Sex.
stat113_df |>
  ggplot(aes(x = Wgt, y = Hgt, shape = Sex)) +
  geom_point()
Warning: Removed 17 rows containing missing values or values outside the scale range
(`geom_point()`).

# Hgt against Wgt, with point size mapped to Exercise.
stat113_df |>
  ggplot(aes(x = Wgt, y = Hgt, size = Exercise)) +
  geom_point()
Warning: Removed 20 rows containing missing values or values outside the scale range
(`geom_point()`).

# Hgt against Wgt, drawn in a separate panel for each Year.
stat113_df |>
  ggplot(aes(x = Wgt, y = Hgt)) +
  geom_point() +
  facet_wrap(vars(Year))
Warning: Removed 16 rows containing missing values or values outside the scale range
(`geom_point()`).

# active_cals against steps, drawn in a separate panel for each month.
fitness_full |>
  ggplot(aes(x = steps, y = active_cals)) +
  geom_point() +
  facet_wrap(vars(month))

Section 2.4

Class Exercise 1

# Collapse the survey to one row per SocialMedia value, storing the number of
# rows for each value in n_social, then print the result.
stat113_restructured <- stat113_df |>
  count(SocialMedia, name = "n_social")
stat113_restructured
# A tibble: 6 × 2
  SocialMedia n_social
  <chr>          <int>
1 Facebook          13
2 Instagram        173
3 None               3
4 Other             16
5 Snapchat         171
6 Twitter           21
# The counts are already computed, so geom_col() draws bar heights straight
# from the n_social column.
stat113_restructured |>
  ggplot(aes(x = SocialMedia, y = n_social)) +
  geom_col()

Class Exercise 2

`method = "lm"` fits a straight line of best fit (a linear regression line)

`span` sets how wiggly the smoothed line is (for the loess smoother, smaller values give a wigglier line)

# Scatterplot of active_cals against distance with a default smoother on top.
fitness_full |>
  ggplot(aes(x = distance, y = active_cals)) +
  geom_point() +
  geom_smooth()
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Class Exercise 3

# Line plot of steps across dayofyear.
fitness_full |>
  ggplot(aes(x = dayofyear, y = steps)) +
  geom_line()

Class Exercise 4

# GPA against Exercise, with one wrapped panel per Sex/Sport combination.
stat113_df |>
  ggplot(aes(x = Exercise, y = GPA)) +
  geom_point() +
  facet_wrap(vars(Sex, Sport))
Warning: Removed 73 rows containing missing values or values outside the scale range
(`geom_point()`).

# GPA against Exercise laid out as a grid: Sex defines the rows and Sport
# defines the columns.
stat113_df |>
  ggplot(aes(x = Exercise, y = GPA)) +
  geom_point() +
  facet_grid(rows = vars(Sex), cols = vars(Sport))
Warning: Removed 73 rows containing missing values or values outside the scale range
(`geom_point()`).

Class Exercise 5

# Boxplots of Exercise for each Award; outliers are drawn with shape 8.
stat113_df |>
  ggplot(aes(x = Award, y = Exercise)) +
  geom_boxplot(outlier.shape = 8)
Warning: Removed 7 rows containing non-finite outside the scale range
(`stat_boxplot()`).

Your Turn 2

# Pulse against Wgt, with a separate panel for each Sport value.
stat113_df |>
  ggplot(aes(x = Wgt, y = Pulse)) +
  geom_point() +
  facet_wrap(vars(Sport))
Warning: Removed 41 rows containing missing values or values outside the scale range
(`geom_point()`).

# Same relationship in a single panel, using colour to distinguish Sport.
stat113_df |>
  ggplot(aes(x = Wgt, y = Pulse, colour = Sport)) +
  geom_point()
Warning: Removed 41 rows containing missing values or values outside the scale range
(`geom_point()`).

Your Turn 3

# Exercise against GPA, with Year shown twice: as the point colour (set
# locally in geom_point) and as the faceting variable.
stat113_df |>
  ggplot(aes(x = GPA, y = Exercise)) +
  geom_point(aes(colour = Year)) +
  facet_wrap(vars(Year))
Warning: Removed 73 rows containing missing values or values outside the scale range
(`geom_point()`).

# Exercise against GPA in one panel, with Year shown only through colour.
stat113_df |>
  ggplot(aes(x = GPA, y = Exercise, colour = Year)) +
  geom_point()
Warning: Removed 73 rows containing missing values or values outside the scale range
(`geom_point()`).

## in general, colour is more useful for variables that have fewer levels

Your Turn 4

# Overlaid frequency polygons of distance, one coloured line per weekday.
# The number of bins is left at the stat_bin() default.
fitness_full |>
  ggplot(aes(x = distance)) +
  geom_freqpoly(aes(colour = weekday))
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histograms of distance, one panel per weekday.
# The number of bins is left at the stat_bin() default.
fitness_full |>
  ggplot(aes(x = distance)) +
  geom_histogram() +
  facet_wrap(vars(weekday))
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Your Turn 5

Line plots are useful if there is one value for the y variable for each distinct value of the x variable. For the stat 113 data there would be more than one exercise value for each unique GPA.

Section 2.5: Aesthetic Mapping in ggplot

library(tidyverse)
# Re-import the fitness data for this section, reading stepgoal as a factor,
# and label each row as "weekend" (Sat/Sun) or "weekday" (everything else).
fitness_full <- read_csv(
  file = "https://raw.githubusercontent.com/highamm/ds234_quarto/main/data_online/higham_fitness_clean.csv",
  col_types = list(stepgoal = col_factor())
) |>
  mutate(
    weekend_ind = case_when(
      weekday %in% c("Sat", "Sun") ~ "weekend",
      .default = "weekday"
    )
  )

# colour is mapped to a variable in the data (stepgoal), so it belongs
# inside aes().
fitness_full |>
  ggplot(aes(x = Start, y = active_cals, colour = stepgoal)) +
  geom_point()

# Deliberate counter-example: "blue" is a constant, not a column of
# fitness_full, yet it is placed inside aes(). ggplot2 therefore treats it as
# a one-level variable rather than as the literal colour blue. Compare with
# the version that passes colour = "blue" directly to geom_point().
ggplot(data = fitness_full,
       aes(x = Start, y = active_cals, colour = "blue")) +
  geom_point()

# A fixed colour is not a data mapping, so it is passed to geom_point()
# outside of aes().
fitness_full |>
  ggplot(aes(x = Start, y = active_cals)) +
  geom_point(colour = "blue")

Exercise 1

# Fixed point size and shape, set outside aes() because neither is a
# variable in the data.
fitness_full |>
  ggplot(aes(x = Start, y = active_cals)) +
  geom_point(shape = 4, size = 2)

Exercise 2

# Scatterplot with a default smoother drawn using a fixed linewidth of 2.
fitness_full |>
  ggplot(aes(x = Start, y = active_cals)) +
  geom_point() +
  geom_smooth(linewidth = 2)
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

2.5.2

# All three aesthetics are global (set in ggplot()), so both the points and
# the smoother use x, y, and colour = stepgoal.
fitness_full |>
  ggplot(aes(x = Start, y = active_cals, colour = stepgoal)) +
  geom_point() +
  geom_smooth()
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Same mappings as the all-global version, but every aesthetic is repeated
# locally inside each geom instead of being set once in ggplot().
fitness_full |>
  ggplot() +
  geom_point(aes(x = Start, y = active_cals, colour = stepgoal)) +
  geom_smooth(aes(x = Start, y = active_cals, colour = stepgoal))
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# x and y are global; colour = stepgoal is local to geom_point(), so only
# the points are coloured and geom_smooth() does not receive the mapping.
fitness_full |>
  ggplot(aes(x = Start, y = active_cals)) +
  geom_point(aes(colour = stepgoal)) +
  geom_smooth()
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# x and y are global; colour = stepgoal is local to geom_smooth(), so only
# the smoother receives the mapping and the points do not.
fitness_full |>
  ggplot(aes(x = Start, y = active_cals)) +
  geom_point() +
  geom_smooth(aes(colour = stepgoal))
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Exercise 3

# Points joined by a line; x and y are supplied locally to each geom rather
# than globally in ggplot().
fitness_full |>
  ggplot() +
  geom_point(aes(x = Start, y = active_cals)) +
  geom_line(aes(x = Start, y = active_cals))

Exercise 4

# Boxplots of steps for each weekday, filled coral1 with a black outline.
fitness_full |>
  ggplot(aes(x = weekday, y = steps)) +
  geom_boxplot(fill = "coral1", colour = "black")

Exercise 5

  1. Statistical Modelling or Communication
  2. Getting it into a consistent form that makes the rest of the analysis easier
  3. They took into account the strengths and weaknesses of being human, as opposed to optimizing it for computers to do everything
  4. Importing and tidying up the data are the first two steps. These are followed by a cycle that can repeat the three steps of transform, visualize, and model as many times as necessary before the information is communicated in some form.

2.6

Class Exercise 1

# colour = Sport is a global aesthetic here, so geom_smooth() inherits it and
# fits a separate smoother for each Sport value. The simpleLoess warnings in
# the rendered output ("fewer data values than degrees of freedom") suggest
# that at least one Sport group has too few rows for a loess fit.
# se = FALSE drops the confidence ribbon around each smoother.
ggplot(data = stat113_df, aes(x = Wgt, y = Hgt, colour = Sport)) +
  geom_point() +
  geom_smooth(se = FALSE)
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: Removed 16 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: span too small.  fewer data values than degrees of freedom.
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: pseudoinverse used at 114.65
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: neighborhood radius 25.35
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: reciprocal condition number 0
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: There are other near singularities as well. 2056.6
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: Chernobyl! trL>n 5
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: Chernobyl! trL>n 5
Warning in sqrt(sum.squares/one.delta): NaNs produced
Warning: Removed 16 rows containing missing values or values outside the scale range
(`geom_point()`).

# colour = Sport is local to geom_point(), so the points are coloured by
# Sport while geom_smooth() fits one smoother to all of the data.
# se = FALSE drops the confidence ribbon.
stat113_df |>
  ggplot(aes(x = Wgt, y = Hgt)) +
  geom_point(aes(colour = Sport)) +
  geom_smooth(se = FALSE)
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: Removed 16 rows containing non-finite outside the scale range
(`stat_smooth()`).
Removed 16 rows containing missing values or values outside the scale range
(`geom_point()`).

Class Exercise 2

# Histogram of TV with white bars and a black outline.
# The number of bins is left at the stat_bin() default.
stat113_df |>
  ggplot(aes(x = TV)) +
  geom_histogram(fill = "white", colour = "black")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 5 rows containing non-finite outside the scale range
(`stat_bin()`).

Your Turn 1

  1. “lm” is not a variable in the data set so it should not be inside an aes() function
  2. This also works. The local option for colour will override the global specification.
  3. colour = “purple” is not in our dataset so it should not be inside of an aes() function; colour = groupvar should be inside of an aes() function as it is in the data set
  4. colour = groupvar should be in an aes() function because it is a part of our data set
  5. This is the correct way to get the specified plot
  6. colour = “purple” should not be in an aes() function because it is not a variable in the data set
# colour = Sex is global, so both the points and the per-group linear fits
# are coloured by Sex; scale_colour_manual() supplies the two colours to use.
stat113_df |>
  ggplot(aes(x = Wgt, y = Hgt, colour = Sex)) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_colour_manual(values = c("purple", "blue"))
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 16 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 16 rows containing missing values or values outside the scale range
(`geom_point()`).

Your Turn 2

Global aesthetics are more useful when they are used by every subsequent geom.

# No global aesthetics: the points get a fixed purple colour (outside aes),
# while the linear fits map colour to Sex inside their own aes().
stat113_df |>
  ggplot() +
  geom_point(aes(x = Wgt, y = Hgt), colour = "purple") +
  geom_smooth(aes(x = Wgt, y = Hgt, colour = Sex), method = "lm")
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 16 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 16 rows containing missing values or values outside the scale range
(`geom_point()`).